The purpose of this script is to generate a heatmap of normalized enrichment scores of the 50 “hallmark pathways” in the MSigDB gene collection. According to description on the molecular signature database site, “Hallmark gene sets are coherently expressed signatures derived by aggregating many MSigDB gene sets to represent well-defined biological states or processes.” We’ll use these to get a sense of the overall global pathway changes occurring in our RNA seq samples compared to a PBS control.
The input needed are the differential expression files (‘SE_2h_over_PBS_2h_diffexpress.csv’) and the hallmark gene set file (h.all.v6.2.entrez.gmt).
This script was adapted from a previous version written by David Hill.
library(clusterProfiler)
library(dplyr)
library(ggplot2)
library(GSEABase)
library(here)
library(knitr)
library(magrittr)
library(org.Hs.eg.db)
library(plotly)
library(RColorBrewer)
library(ReactomePA)
library(rmarkdown)
library(stringi)
# Directory for output (in results folder)
dir.create(path = here("results/DESeq2_human/GSEA/hallmark/"),
recursive = TRUE)This analysis is based on running gene set enrichment using differential expression files as input. These are the steps:
We’ll write a loop to iterate these steps over each differential expression file and combine results together.
# load hallmark gene set
hall <- read.gmt(gmtfile = here("data/h.all.v6.2.symbols.gmt"))
# Define list of diff expression files to run
files <- list.files(here('results/DESeq2_human/diff_expression/'))
files## [1] "SE_2h_over_PBS_2h_diffexpress.csv"
## [2] "SE_8h_over_PBS_8h_diffexpress.csv"
## [3] "SPI1_2h_over_PBS_2h_diffexpress.csv"
## [4] "SPI1_8h_over_PBS_8h_diffexpress.csv"
## [5] "SPI2_2h_over_PBS_2h_diffexpress.csv"
## [6] "SPI2_8h_over_PBS_8h_diffexpress.csv"
## [7] "ST_2h_over_PBS_2h_diffexpress.csv"
## [8] "ST_8h_over_PBS_8h_diffexpress.csv"
## [9] "STM_2h_over_PBS_2h_diffexpress.csv"
## [10] "STM_8h_over_PBS_8h_diffexpress.csv"
# Create empty object to put full data into
full.data <- NULL
# Iterate over all files - run hallmark GSEA, save output
for(i in seq(files)){
# Sample being analyzed
sample <- gsub('_diffexpress.csv','',files[i])
# Load diff expression file, sort by descending fold change
input <- read.csv(file = here('results/DESeq2_human/diff_expression',files[i])) %>%
arrange(-log2FoldChange)
# Create vector of descending list of gene changes, named by symbol
up.list <- input$log2FoldChange
names(up.list) <- input$symbol
# REACTOME GSEA
gmt.gsea <- GSEA(geneList = up.list,
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 1,
verbose = F,
TERM2GENE = hall)
# Convert to dataframe, add columns with sample labels
temp <- as.data.frame(gmt.gsea) %>%
mutate(pathway = gsub('_',' ',.$ID),
sample = sample,
label = gsub('_.*','',files[i]),
time = stri_extract_first_regex(files[i], '[0-9]h'))
# Remove 'HALLMARK' from pathway column
temp$pathway <- gsub('HALLMARK ', '', temp$pathway)
# Save results
out.dir <- here('results/DESeq2_human/GSEA/hallmark/')
write.csv(temp,
file = paste0(out.dir,'GSEA_hallmark_',sample,'.csv'))
# Combine into full dataframe
full.data <- rbind(full.data, temp)
# Monitor progress
print(paste(i, 'of', length(files),'done:',files[i] ))
}## [1] "1 of 10 done: SE_2h_over_PBS_2h_diffexpress.csv"
## [1] "2 of 10 done: SE_8h_over_PBS_8h_diffexpress.csv"
## [1] "3 of 10 done: SPI1_2h_over_PBS_2h_diffexpress.csv"
## [1] "4 of 10 done: SPI1_8h_over_PBS_8h_diffexpress.csv"
## [1] "5 of 10 done: SPI2_2h_over_PBS_2h_diffexpress.csv"
## [1] "6 of 10 done: SPI2_8h_over_PBS_8h_diffexpress.csv"
## [1] "7 of 10 done: ST_2h_over_PBS_2h_diffexpress.csv"
## [1] "8 of 10 done: ST_8h_over_PBS_8h_diffexpress.csv"
## [1] "9 of 10 done: STM_2h_over_PBS_2h_diffexpress.csv"
## [1] "10 of 10 done: STM_8h_over_PBS_8h_diffexpress.csv"
# Save the full data
write.csv(full.data, here('results/DESeq2_human/GSEA/hallmark/GSEA_hallmark_all.csv'))Now that we have all the GSEA normalized enrichment scores for all samples in a single dataframe, we can make a heatmap to visualize changes in pathways. We’ll use ggplot to make a heatmap (using geom_tile()) of the normalized enrichment scores (NES) and split the 2h and 8h samples apart vertically. The pathways will be in descending order with the most enriched pathways at the top and the least enriched at the bottom.
# Set sample order for heatmap
full.data$label <- factor(full.data$label,
levels = c('SE','ST','STM','SPI1','SPI2'))
# Set order for heatmap - pathways by descending average NES
NES.avg <- group_by(full.data, pathway) %>%
summarise(NES_avg = mean(NES)) %>%
arrange(NES_avg)
full.data$pathway <- factor(full.data$pathway, levels = NES.avg$pathway)
# Round NES (remove decimals for ggplotly tooltip)
full.data$NES <- round(full.data$NES, 2)
# Set up palette of colors for heatmap
hm.palette <- colorRampPalette(rev(brewer.pal(11, 'RdYlBu')), space='Lab')# Heatmap
hallmark_plot <- function(input){
ggplot(input, aes(label, pathway)) +
geom_tile(aes(fill = NES), colour = "white") +
scale_fill_gradientn(colors = hm.palette(100))+
theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45),
strip.text.x = element_text(size = 12, face = 'bold'),
strip.text.y = element_text(size = 12, face = 'bold'),
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 10),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12, face = 'bold'))+
ggtitle('GSEA hallmark gene set enrichment')+
labs(x='', y = 'Pathway')+
facet_grid(cols = vars(time), scales = 'free')
}
p <- hallmark_plot(full.data)
pggplotly(p)png(filename = here("/img/GSEA_hallmark_heatmap.png"),
width =7, height = 8.5, units = 'in', res = 300)
p
dev.off()The paper will be split into sections and so we’ll split the heatmap into two groups of samples:
# Select samples for heatmap
muts <- filter(full.data, label %in% c('STM', 'SPI1', 'SPI2'))# Heatmap
p1 <- hallmark_plot(muts)
p1# Select samples
ser <- filter(full.data, label %in% c('STM','SE','ST'))
p2 <- hallmark_plot(ser)
p2png(filename = here("/img/stm_mutants/mut_GSEA_hallmark_heatmap.png"),
width =6.5, height = 8.5, units = 'in', res = 300)
p1
dev.off()
png(filename = here("/img/serovars/ser_GSEA_hallmark_heatmap.png"),
width =6.5, height = 8.5, units = 'in', res = 300)
p2
dev.off()